In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import warnings
import enclus
warnings.filterwarnings('ignore')
sc.set_figure_params(dpi=300,fontsize=10,dpi_save=300,facecolor='white',format='pdf')
sc.settings.figdir = './Result/hypothalamic/'
uniPort¶
In [24]:
import uniport as up
print(up.__version__)
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')
sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()
sc_data.var_names_make_unique()
sc_data.var_names_make_unique()
st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'
sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'
sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)
sc.pp.highly_variable_genes(st_data, n_top_genes=2000, inplace=True, subset=True)
up.batch_scale(st_data)
sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
sc.pp.highly_variable_genes(sc_data, n_top_genes=2000, inplace=True, subset=True)
up.batch_scale(sc_data)
print(sc_data,st_data)
# st_data.write_h5ad('./datasets/ProcessedMERFISH_hypothalamic_st_data.h5ad')
# sc_data.write_h5ad('./datasets/ProcessedMERFISH_hypothalamic_sc_data.h5ad')
1.2.2
AnnData object with n_obs × n_vars = 30370 × 2000
obs: 'cell_type', 'domain_id', 'source'
var: 'Gene', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg' AnnData object with n_obs × n_vars = 64373 × 155
obs: 'cell_type', 'domain_id', 'source'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
obsm: 'spatial'
In [6]:
adata_merfish = sc.read('./datasets/ProcessedMERFISH_hypothalamic_st_data.h5ad')
adata_rna = sc.read('./datasets/ProcessedMERFISH_hypothalamic_sc_data.h5ad')
adata_cm = sc.read('./datasets/MERFISH_hypothalamic_cm_data.h5ad')
In [ ]:
adata = up.Run(adatas=[adata_merfish, adata_rna], adata_cm=adata_cm, lambda_kl=5.0)
Device: cpu
Dataset 0: MERFISH
AnnData object with n_obs × n_vars = 64373 × 155
obs: 'cell_type', 'domain_id', 'source'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
obsm: 'spatial'
Dataset 1: RNA
AnnData object with n_obs × n_vars = 30370 × 2000
obs: 'cell_type', 'domain_id', 'source'
var: 'Gene', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
Reference dataset is dataset 1
Data with common HVG
AnnData object with n_obs × n_vars = 94743 × 154
obs: 'cell_type', 'domain_id', 'source'
var: 'Gene-1'
uns: 'log1p'
Epochs: 100%|██████████| 82/82 [45:23<00:00, 33.22s/it, recloss=791.54,klloss=29.45,otloss=4.24]
In [11]:
sc.pp.neighbors(adata, use_rep='latent')
sc.tl.umap(adata, min_dist=0.1)
sc.pl.umap(adata, color=['source', 'cell_type'],save='uniport-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapuniport-integrate.pdf
In [ ]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']=='0']
adata2 = adata[adata.obs['domain_id']=='1']
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_umap')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_umap'], adata.obs['cell_type']))
F1: 0.871461070953648 ARI: 0.7429336420387456 NMI: 0.732073753207167 total score: 2.3464684661995605 Batch Entropy: 0.47998895178474116 Silhouette: 0.693193793296814
MultiMAP¶
In [3]:
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')
sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()
sc_data.var_names_make_unique()
sc_data.var_names_make_unique()
st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'
sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'
sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)
sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
sc.pp.scale(st_data)
sc.pp.pca(st_data)
sc.pp.scale(sc_data)
sc.pp.pca(sc_data)
In [4]:
sc_data,st_data
Out[4]:
(AnnData object with n_obs × n_vars = 30370 × 27998
obs: 'cell_type', 'domain_id', 'source'
var: 'Gene', 'mean', 'std'
uns: 'log1p', 'pca'
obsm: 'X_pca'
varm: 'PCs',
AnnData object with n_obs × n_vars = 64373 × 155
obs: 'cell_type', 'domain_id', 'source'
var: 'mean', 'std'
uns: 'log1p', 'pca'
obsm: 'spatial', 'X_pca'
varm: 'PCs')
In [5]:
from integration.MultiMAP.model import *
adata = Integration([st_data, sc_data], ['X_pca', 'X_pca'])
In [ ]:
sc.pl.embedding(adata, 'X_multimap', color=['source','cell_type'],save='MultiMAP-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\X_multimapMultiMAP-integrate.pdf
d:\Program Files\Anaconda3\envs\integrate\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( d:\Program Files\Anaconda3\envs\integrate\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [ ]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']==0]
adata2 = adata[adata.obs['domain_id']==1]
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_multimap')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_multimap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_multimap'], adata.obs['cell_type']))
F1: 0.8808136163876433 ARI: 0.7628832260492341 NMI: 0.7679373319525135 total score: 2.4116341743893908 Batch Entropy: 0.514425433948172 Silhouette: 0.6702606976032257
gimVI¶
In [2]:
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')
sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()
sc_data.var_names_make_unique()
sc_data.var_names_make_unique()
st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'
sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'
sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)
sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
In [3]:
from baseline.scvi.model import GIMVI
import baseline.scvi as scvi
train_size = 0.8
# only use genes in both datasets
common_gene = np.intersect1d(st_data.var_names,sc_data.var_names)
st_data = st_data[:,common_gene]
seq_data = sc_data[:, common_gene]
seq_gene_names = seq_data.var_names
n_genes = seq_data.n_vars
n_train_genes = int(n_genes * train_size)
# randomly select training_genes
rand_train_gene_idx = np.random.choice(range(n_genes), n_train_genes, replace=False)
rand_test_gene_idx = sorted(set(range(n_genes)) - set(rand_train_gene_idx))
rand_train_genes = seq_gene_names[rand_train_gene_idx]
rand_test_genes = seq_gene_names[rand_test_gene_idx]
# spatial_data_partial has a subset of the genes to train on
spatial_data_partial = st_data[:, rand_train_genes].copy()
# remove cells with no counts
sc.pp.filter_cells(spatial_data_partial, min_counts=1)
sc.pp.filter_cells(seq_data, min_counts=1)
# setup_anndata for spatial and sequencing data
scvi.data.setup_anndata(spatial_data_partial)
scvi.data.setup_anndata(seq_data)
# spatial_data should use the same cells as our training data
# cells may have been removed by scanpy.pp.filter_cells()
spatial_data = st_data[spatial_data_partial.obs_names]
INFO 'scvi' logger already has a StreamHandler, set its level to 20.
In [4]:
model = GIMVI(seq_data, spatial_data_partial)
model.train(max_epochs=200)
Training...: 100%|██████████| 200/200 [31:44<00:00, 9.52s/it]
In [5]:
seq_data,st_data,spatial_data_partial
Out[5]:
(AnnData object with n_obs × n_vars = 30367 × 154
obs: 'cell_type', 'domain_id', 'source', 'n_counts', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
var: 'Gene'
uns: 'log1p', '_scvi',
View of AnnData object with n_obs × n_vars = 64373 × 154
obs: 'cell_type', 'domain_id', 'source'
uns: 'log1p'
obsm: 'spatial',
AnnData object with n_obs × n_vars = 64372 × 123
obs: 'cell_type', 'domain_id', 'source', 'n_counts', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
uns: 'log1p', '_scvi'
obsm: 'spatial')
In [8]:
import anndata
# get the latent representations for the sequencing and spatial data
latent_seq, latent_spatial = model.get_latent_representation()
# concatenate to one latent representation
latent_representation = np.concatenate([latent_spatial,latent_seq])
latent_adata = anndata.AnnData(latent_representation)
# labels which cells were from the sequencing dataset and which were from the spatial dataset
latent_labels =(["MERFISH"] * latent_spatial.shape[0]) + (["RNA"] * latent_seq.shape[0])
latent_adata.obs["labels"] = latent_labels
latent_adata.obs["cell_type"] = spatial_data_partial.obs['cell_type'].to_list()+seq_data.obs['cell_type'].to_list()
latent_adata.obs["domain_id"] =(["0"] * latent_spatial.shape[0]) + (["1"] * latent_seq.shape[0])
# compute umap
sc.pp.neighbors(latent_adata, use_rep="X")
sc.tl.umap(latent_adata)
# save umap representations to original seq and spatial_datasets
seq_data.obsm["X_umap"] = latent_adata.obsm["X_umap"][: seq_data.shape[0]]
spatial_data.obsm["X_umap"] = latent_adata.obsm["X_umap"][seq_data.shape[0] :]
In [3]:
# umap of the combined latent space
sc.pl.umap(latent_adata, color=['labels', 'cell_type'],save='gimVI-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapgimVI-integrate.pdf
In [10]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = latent_adata[latent_adata.obs['domain_id']=='0']
adata2 = latent_adata[latent_adata.obs['domain_id']=='1']
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_umap')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(latent_adata.obsm['X_umap'], latent_adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(latent_adata.obsm['X_umap'], latent_adata.obs['cell_type']))
F1: 0.6458704940597764 ARI: 0.625984661312957 NMI: 0.6066587759777841 total score: 1.8785139313505175 Batch Entropy: 0.5944166602436534 Silhouette: 0.5858384892344475
SCALEX¶
In [2]:
import scalex
from scalex.function import SCALEX
from scalex.plot import embedding
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
In [17]:
adata_cm = sc.read('./datasets/MERFISH_hypothalamic_cm_data.h5ad')
adata_cm.obs['batch'] = adata_cm.obs['source']
adata_cm.write_h5ad('./datasets/scalex_hypo_adata.h5ad')
wk_dir = './datasets/'
In [23]:
adata = SCALEX(data_list = [wk_dir+'scalex_hypo_adata.h5ad'],
min_features=0,
min_cells=0,
outdir='./Result/hypothalamic/',
show=False,
gpu=0)
2025-03-12 23:31:13,175 - SCALEX - INFO - Raw dataset shape: (94743, 154) 2025-03-12 23:31:13,175 - SCALEX - INFO - Raw dataset shape: (94743, 154) 2025-03-12 23:31:13,177 - SCALEX - INFO - There are 2 batches under batch_name: batch 2025-03-12 23:31:13,177 - SCALEX - INFO - There are 2 batches under batch_name: batch 2025-03-12 23:31:13,178 - SCALEX - INFO - Preprocessing 2025-03-12 23:31:13,178 - SCALEX - INFO - Preprocessing 2025-03-12 23:31:13,186 - SCALEX - INFO - Filtering cells 2025-03-12 23:31:13,186 - SCALEX - INFO - Filtering cells 2025-03-12 23:31:13,365 - SCALEX - INFO - Filtering features 2025-03-12 23:31:13,365 - SCALEX - INFO - Filtering features 2025-03-12 23:31:13,497 - SCALEX - INFO - Normalizing total per cell 2025-03-12 23:31:13,497 - SCALEX - INFO - Normalizing total per cell 2025-03-12 23:31:13,526 - SCALEX - INFO - Log1p transforming 2025-03-12 23:31:13,526 - SCALEX - INFO - Log1p transforming
WARNING: adata.X seems to be already log-transformed.
2025-03-12 23:31:13,552 - SCALEX - INFO - Finding variable features 2025-03-12 23:31:13,552 - SCALEX - INFO - Finding variable features 2025-03-12 23:31:13,789 - SCALEX - INFO - Batch specific maxabs scaling 2025-03-12 23:31:13,789 - SCALEX - INFO - Batch specific maxabs scaling 2025-03-12 23:31:14,986 - SCALEX - INFO - Processed dataset shape: (94743, 154) 2025-03-12 23:31:14,986 - SCALEX - INFO - Processed dataset shape: (94743, 154) Epochs: 100%|██████████| 21/21 [06:54<00:00, 19.73s/it, recon_loss=58.260,kl_loss=2.497] 2025-03-12 23:38:20,754 - SCALEX - INFO - Plot umap 2025-03-12 23:38:20,754 - SCALEX - INFO - Plot umap
WARNING: saving figure to file Result\hypothalamic\umap.png
In [26]:
sc.set_figure_params(dpi=300,fontsize=10,dpi_save=300,facecolor='white',format='pdf')
sc.pl.umap(adata, color=['batch', 'cell_type'],save='SCALEX-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapSCALEX-integrate.pdf
In [30]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']=='0']
adata2 = adata[adata.obs['domain_id']=='1']
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_scalex_umap')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_scalex_umap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_scalex_umap'], adata.obs['cell_type']))
F1: 0.7510572103225998 ARI: 0.5705643773958746 NMI: 0.5728128818126889 total score: 1.8944344695311632 Batch Entropy: 0.5145418383234401 Silhouette: 0.6278813630342484
scVI¶
In [11]:
import anndata
import scanpy as sc
import scvi
import warnings
warnings.filterwarnings('ignore')
sc.set_figure_params(dpi=300,fontsize=10,dpi_save=300,facecolor='white',format='pdf')
sc.settings.figdir = './Result/hypothalamic/'
In [12]:
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')
sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()
sc_data.var_names_make_unique()
sc_data.var_names_make_unique()
st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'
sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'
sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)
sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
In [29]:
adata = anndata.concat([st_data,sc_data])
scvi.model.SCVI.setup_anndata(adata, batch_key="source")
scvi_model = scvi.model.SCVI(adata, n_layers=2, n_latent=30)
scvi_model.train()
GPU available: False, used: False TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs
Epoch 1/84: 0%| | 0/84 [00:00<?, ?it/s]
Epoch 84/84: 100%|██████████| 84/84 [10:42<00:00, 7.90s/it, v_num=1, train_loss_step=67.9, train_loss_epoch=74.5]
`Trainer.fit` stopped: `max_epochs=84` reached.
Epoch 84/84: 100%|██████████| 84/84 [10:42<00:00, 7.65s/it, v_num=1, train_loss_step=67.9, train_loss_epoch=74.5]
In [30]:
SCVI_LATENT_KEY = "X_scVI"
adata.obsm[SCVI_LATENT_KEY] = scvi_model.get_latent_representation()
sc.pp.neighbors(adata, use_rep=SCVI_LATENT_KEY)
sc.tl.umap(adata, min_dist=0.1)
In [31]:
sc.pl.umap(adata, color=['source', 'cell_type'],save='scvi-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapscvi-integrate.pdf
In [37]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']==0]
adata2 = adata[adata.obs['domain_id']==1]
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_umap')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_umap'], adata.obs['cell_type']))
F1: 0.817277553437705 ARI: 0.6490055920337885 NMI: 0.6642281616875901 total score: 2.1305113071590838 Batch Entropy: 0.4367643613852369 Silhouette: 0.6226593926548958
Seurat¶
In [ ]:
# Export unstandardized data to R for processing
adata = sc.read('./datasets/MERFISH_hypothalamic_cm_data.h5ad')
adata.to_df().T.to_csv('expression_matrix.csv')
adata.obs['source'].to_csv('batch_info.csv')
adata.obs['cell_type'].to_csv('cell_type_info.csv')
In [6]:
import pandas as pd
import numpy as np
import anndata
# read embedding
embedding = pd.read_csv("./Result/hypothalamic/integrated_cca_embedding.csv", index_col=0)
# read metadata
metadata = pd.read_csv("./Result/hypothalamic/metadata.csv")
metadata.index = metadata["Cell"] # Set the index as the cell name
metadata = metadata.drop(columns=["Cell"])
# buid AnnData
adata = anndata.AnnData(X=embedding)
adata.obs = metadata # Add batch and cell type information
print(adata)
print(adata.obs.head())
AnnData object with n_obs × n_vars = 94743 × 50
obs: 'source', 'cell_type'
source cell_type
Cell
X6749ccb4.2ed1.4029.968f.820a287f43c8.0 MERFISH Astrocytes
X6cac74bd.4ea7.4701.8701.42563cc65eb8.0 MERFISH Inhibitory
X9f29bd57.16a5.4b26.b9f5.37598809da9e.0 MERFISH Inhibitory
d7eb4e0b.276e.47e3.a55c.0b033180a2fe.0 MERFISH Inhibitory
X54434f3a.eba9.4aec.af35.c9d317ffa1d5.0 MERFISH Inhibitory
In [4]:
def label_transfer(ref, query, label='celltype'):
from sklearn.neighbors import KNeighborsClassifier
X_train = ref.X
y_train = ref.obs[label]
X_test = query.X
knn = knn = KNeighborsClassifier().fit(X_train, y_train)
y_test = knn.predict(X_test)
return y_test
In [8]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['source']=='MERFISH']
adata2 = adata[adata.obs['source']=='RNA']
y_test = label_transfer(adata2, adata1, label='cell_type')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.X, adata.obs['source']))
print('Silhouette:', enclus.metrics.silhouette(adata.X, adata.obs['cell_type']))
F1: 0.8091003178716528 ARI: 0.6508844103152466 NMI: 0.6594581930237735 total score: 2.119442921210673 Batch Entropy: 0.29372671828102115 Silhouette: 0.5137207407478872
Harmony¶
In [9]:
import pandas as pd
import numpy as np
import anndata
embedding = pd.read_csv("./Result/hypothalamic/integrated_harmony_embedding.csv", index_col=0)
metadata = pd.read_csv("./Result/hypothalamic/metadata.csv")
metadata.index = metadata["Cell"]
metadata = metadata.drop(columns=["Cell"])
adata = anndata.AnnData(X=embedding)
adata.obs = metadata
print(adata)
print(adata.obs.head())
AnnData object with n_obs × n_vars = 94743 × 50
obs: 'source', 'cell_type'
source cell_type
Cell
X6749ccb4.2ed1.4029.968f.820a287f43c8.0 MERFISH Astrocytes
X6cac74bd.4ea7.4701.8701.42563cc65eb8.0 MERFISH Inhibitory
X9f29bd57.16a5.4b26.b9f5.37598809da9e.0 MERFISH Inhibitory
d7eb4e0b.276e.47e3.a55c.0b033180a2fe.0 MERFISH Inhibitory
X54434f3a.eba9.4aec.af35.c9d317ffa1d5.0 MERFISH Inhibitory
In [10]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['source']=='MERFISH']
adata2 = adata[adata.obs['source']=='RNA']
y_test = label_transfer(adata2, adata1, label='cell_type')
f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.X, adata.obs['source']))
print('Silhouette:', enclus.metrics.silhouette(adata.X, adata.obs['cell_type']))
F1: 0.8003951568856976 ARI: 0.6026170137527536 NMI: 0.6698873201850263 total score: 2.0728994908234775 Batch Entropy: 0.15340537230811968 Silhouette: 0.49228478616875704